The purpose of the case study is to classify a given silhouette as one of four different types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
In this dataset the two car models share a single "car" label, so the classifier distinguishes three classes — bus, van and car — using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np
%matplotlib inline
sns.set(style="ticks", color_codes=True)
from scipy.stats import zscore
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import model_selection
# Load the vehicle silhouette dataset (expects vehicle.csv in the working directory).
vehicle = pd.read_csv('vehicle.csv')
# Shape of Data
vehicle.shape
# Info
vehicle.info()
vehicle.head(15)
# Summary of Numeric Attributes
vehicle.describe().transpose()
# Per-class summary statistics for every numeric attribute.
vehicle.groupby('class').describe().transpose().unstack(1)
# Check missing values
vehicle.isnull().sum()
# Drop rows with any missing value; all subsequent analysis uses this cleaned copy.
cleanVehicle = vehicle.dropna()
cleanVehicle.info()
cleanVehicle.columns
# Pairwise scatter plots of all features, coloured by vehicle class.
sns.pairplot(cleanVehicle, hue='class')
# Visualize individual Independent variables
# Per-feature visualisation: one subplot row per independent variable, with a
# class-wise boxplot on the left and overlaid per-class distributions on the right.
fig, ax_arr = plt.subplots(nrows=18, ncols=2, figsize=(15, 80))
feature_cols = [c for c in cleanVehicle.columns if c != 'class']
for row_idx, feature in enumerate(feature_cols):
    box_ax = ax_arr[row_idx, 0]
    dist_ax = ax_arr[row_idx, 1]
    # Boxplot of the feature split by vehicle class.
    sns.boxplot(y=cleanVehicle[feature], x=cleanVehicle['class'], data=cleanVehicle, orient='v', ax=box_ax)
    box_ax.set_xlabel('Class', fontsize=10)
    box_ax.set_ylabel(feature, fontsize=10)
    box_ax.set_title(feature + ' Distribution', fontsize=10)
    box_ax.tick_params(labelsize=10)
    # Overlaid distribution of the feature for each class.
    for cls_label, colour in (('car', 'r'), ('van', 'b'), ('bus', 'g')):
        sns.distplot(cleanVehicle[cleanVehicle['class'] == cls_label][feature], color=colour, ax=dist_ax)
    dist_ax.set_title(feature + ' Distribution', fontsize=10)
plt.subplots_adjust(wspace=0.5)
plt.tight_layout()
plt.show()
# Distribution of class
ax = sns.countplot(x="class", data=cleanVehicle)
from sklearn import preprocessing
# Convert target class to numeric
label_encoder = preprocessing.LabelEncoder()
# Encode labels in column 'class' into a new numeric 'target' column.
cleanVehicle['target'] = label_encoder.fit_transform(cleanVehicle['class'])
# Correlation heat map of the numeric columns.
# FIX: DataFrame.corr() raises on the non-numeric 'class' column in pandas >= 2.0
# (and silently dropped it before), so compute the correlation without it.
corr = cleanVehicle.drop(columns=['class']).corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(13,7))
# create a mask so we only see the correlation values once
mask = np.zeros_like(corr)
mask[np.triu_indices_from(mask, 1)] = True
a = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)
Since machine learning algorithms only take numerical values, the categorical target column 'class' is converted to a numeric 'target' column using label encoding.
The features themselves are already numeric silhouette measurements, so no further encoding is needed; only the class label requires transformation before modelling.
# Sanity-check the distinct class labels present in the data.
cleanVehicle['class'].unique()
#Identify outliers and replace them by median
# for col_name in cleanVehicle.columns[:-1]:
# q1 = cleanVehicle[col_name].quantile(0.25)
# q3 = cleanVehicle[col_name].quantile(0.75)
# iqr = q3 - q1
# low = q1-1.5*iqr
# high = q3+1.5*iqr
# print("column:", col_name, "low:",low, " high:", high)
# #cleanVehicle.loc[(cleanVehicle[col_name] < low) | (cleanVehicle[col_name] > high), col_name] = cleanVehicle[col_name].median()
from sklearn import preprocessing
# Convert target class to numeric
# NOTE(review): this LabelEncoder is re-created here but never used below;
# the 'target' column was already label-encoded earlier in the notebook.
label_encoder = preprocessing.LabelEncoder()
Y = cleanVehicle['target']
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
# Feature matrix: everything except the raw label and its encoded copy.
X = cleanVehicle.drop(['class', 'target'], axis=1)
# 30% of the data will be used for testing
test_size= 0.30
seed = 100
# Stratified 10-fold CV used by every model below.
# FIX: shuffle=True without random_state made every cross-validation score
# irreproducible between runs; seed the fold shuffling.
sk_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed, stratify=Y)
print(x_train.columns)
# Ensure target class maintains balance as the original
print("Original Y: \n", Y.value_counts(normalize=True))
print("y_train: \n", y_train.value_counts(normalize=True))
# stratify=Y ensures data is split in the same ratio as original target class data.
# Scale the Data
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# Fit the scaler on the training split only, then apply the same transform to
# the test split, to avoid leaking test-set statistics into training.
sc_x_train = sc_X.fit_transform(x_train)
sc_x_test = sc_X.transform(x_test)
## There are several variables that are highly correlated with each other
#Run PCA and plot to visualise the ideal number of components
from sklearn.decomposition import PCA
pca = PCA().fit(sc_x_train)
# Cumulative explained variance as a function of component count.
plt.plot(np.cumsum(pca.explained_variance_ratio_))
np.cumsum(pca.explained_variance_ratio_)
#Based on the plot, we will select 10 components that explain almost 99% of the variance
pca = PCA(n_components=10)
pca.fit(sc_x_train)
#Assign the components to the X variable
pca_x_train = pca.transform(sc_x_train)
pca_x_test = pca.transform(sc_x_test)
from sklearn import model_selection
from sklearn.metrics import f1_score
# k-Nearest Neighbours on the PCA-reduced features.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(pca_x_train, y_train)
knnpred = knn.predict(pca_x_test)
print(confusion_matrix(y_test, knnpred))
# FIX: convert to a percentage before rounding; round(acc, 2)*100 first throws
# away a decimal of precision and then reintroduces float noise (e.g. 81.99999...).
print(round(accuracy_score(y_test, knnpred) * 100, 2))
# Mean stratified-CV accuracy, used in the final model comparison table.
KNNCV = (cross_val_score(knn, pca_x_train, y_train, cv=sk_fold, n_jobs=4, scoring = 'accuracy').mean())
print(metrics.classification_report(y_test, knnpred))
from sklearn.svm import SVC
#Grid search to tune model parameters for SVC
from sklearn.model_selection import GridSearchCV
model = SVC()
params = {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear', 'rbf']}
model1 = GridSearchCV(model, param_grid=params, verbose=5, n_jobs=4)
model1.fit(pca_x_train, y_train)
print("Best Hyper Parameters:\n", model1.best_params_)
print()
# Build the model with the best hyper parameters.
# FIX: use the parameters the grid search actually found instead of hard-coding
# C=1, kernel='rbf', which silently discarded the search result.
model = SVC(probability=True, **model1.best_params_)
# Mean stratified-CV accuracy, used in the final model comparison table.
scores = model_selection.cross_val_score(model, pca_x_train, y_train, cv=sk_fold, scoring='accuracy')
print("scores:", scores)
print("mean score:", scores.mean())
SVCCV = scores.mean()
model.fit(pca_x_train, y_train)
svcpred = model.predict(pca_x_test)
print(confusion_matrix(y_test, svcpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, svcpred) * 100, 2))
print(metrics.classification_report(y_test, svcpred))
# Gaussian Naive Bayes baseline on the PCA-transformed features.
from sklearn.naive_bayes import GaussianNB
nb_model = GaussianNB()
nb_model.fit(pca_x_train, y_train)
nb_pred = nb_model.predict(pca_x_test)
# Hold-out confusion matrix and accuracy (as a percentage).
print(confusion_matrix(y_test, nb_pred))
print(round(accuracy_score(y_test, nb_pred),2)*100)
# Mean stratified-CV accuracy, used in the final model comparison table.
GAUSIAN = (cross_val_score(nb_model, pca_x_train, y_train, cv=sk_fold, n_jobs=4, scoring = 'accuracy').mean())
print(metrics.classification_report(y_test, nb_pred))
# Single decision tree (gini impurity) on the PCA-transformed features.
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(criterion='gini')
tree_clf.fit(pca_x_train, y_train)
tree_pred = tree_clf.predict(pca_x_test)
# Hold-out confusion matrix and accuracy (as a percentage).
print(confusion_matrix(y_test, tree_pred))
print(round(accuracy_score(y_test, tree_pred),2)*100)
# Mean stratified-CV accuracy, used in the final model comparison table.
DTREECV = (cross_val_score(tree_clf, pca_x_train, y_train, cv=sk_fold, n_jobs=4, scoring = 'accuracy').mean())
print(metrics.classification_report(y_test, tree_pred))
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 10, max_depth=10, criterion='gini') #criterion = entropy, gini
rfc.fit(pca_x_train, y_train)
rfcpred = rfc.predict(pca_x_test)
print(confusion_matrix(y_test, rfcpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, rfcpred) * 100, 2))
# FIX: n_jobs=4 for consistency with every other cross_val_score call in the file.
RFCCV = (cross_val_score(rfc, pca_x_train, y_train, cv=sk_fold, n_jobs=4, scoring = 'accuracy').mean())
print(metrics.classification_report(y_test, rfcpred))
from xgboost import XGBClassifier
# Gradient-boosted trees (XGBoost) on the PCA-transformed features.
xgb = XGBClassifier(learning_rate =0.1, n_estimators=20, max_depth=10, min_child_weight=1, gamma=0, subsample=0.8,
colsample_bytree=0.8, scale_pos_weight=1)
xgb.fit(pca_x_train, y_train)
xgbpred = xgb.predict(pca_x_test)
print(confusion_matrix(y_test, xgbpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, xgbpred) * 100, 2))
# FIX: use the shared stratified folds and explicit accuracy scoring so XGB is
# comparable with the other models' CV scores (cv=10 was unstratified).
XGB = (cross_val_score(estimator = xgb, X = pca_x_train, y = y_train, cv = sk_fold, scoring='accuracy').mean())
# Added for consistency with the other model cells.
print(metrics.classification_report(y_test, xgbpred))
from sklearn.ensemble import BaggingClassifier
from sklearn import tree
# Bagging ensemble of decision trees on the PCA-transformed features.
bgc = BaggingClassifier(tree.DecisionTreeClassifier(random_state=seed), n_estimators=20)
bgc.fit(pca_x_train, y_train)
bgcpred = bgc.predict(pca_x_test)
# FIX: removed a discarded bgc.score(...) expression that duplicated the
# accuracy computed below.
print(confusion_matrix(y_test, bgcpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, bgcpred) * 100, 2))
# FIX: use the shared stratified folds and explicit accuracy scoring for a
# CV score comparable with the other models (cv=10 was unstratified).
BGC = (cross_val_score(estimator = bgc, X = pca_x_train, y = y_train, cv = sk_fold, scoring='accuracy').mean())
print(metrics.classification_report(y_test, bgcpred))
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost ensemble on the PCA-transformed features.
adab = AdaBoostClassifier(random_state=seed, n_estimators=5)
adab.fit(pca_x_train, y_train)
adabpred = adab.predict(pca_x_test)
# FIX: removed a discarded adab.score(...) expression that duplicated the
# accuracy computed below.
print(confusion_matrix(y_test, adabpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, adabpred) * 100, 2))
# FIX: use the shared stratified folds and explicit accuracy scoring for a
# CV score comparable with the other models (cv=10 was unstratified).
ADAB = (cross_val_score(estimator = adab, X = pca_x_train, y = y_train, cv = sk_fold, scoring='accuracy').mean())
print(metrics.classification_report(y_test, adabpred))
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting ensemble on the PCA-transformed features.
gbc= GradientBoostingClassifier(n_estimators=10, max_depth=5, random_state=seed)
gbc.fit(pca_x_train, y_train)
gbcpred = gbc.predict(pca_x_test)
print(confusion_matrix(y_test, gbcpred))
# FIX: percentage computed before rounding (see KNN cell for rationale).
print(round(accuracy_score(y_test, gbcpred) * 100, 2))
# FIX: use the shared stratified folds and explicit accuracy scoring for a
# CV score comparable with the other models (cv=10 was unstratified).
GBC = (cross_val_score(estimator = gbc, X = pca_x_train, y = y_train, cv = sk_fold, scoring='accuracy').mean())
print(metrics.classification_report(y_test, gbcpred))
# Side-by-side comparison of the mean cross-validated accuracy of every model,
# sorted best-first.
model_scores = {
    'Random Forest Classifier': RFCCV,
    'Decision Tree Classifier': DTREECV,
    'Support Vector Machine': SVCCV,
    'K-Near Neighbors': KNNCV,
    'Gausian NB': GAUSIAN,
    'XGBoost': XGB,
    'Gradient Boosting': GBC,
    'Adaptive Boosting': ADAB,
    'Bagging Classifier': BGC,
}
models = pd.DataFrame({'Models': list(model_scores.keys()),
                       'Score': list(model_scores.values())})
models.sort_values(by='Score', ascending=False)
Support Vector Classifier seems to perform the best with an accuracy of almost 95%, followed by XGBoost.